from htmldocx import HtmlToDocx

from pathlib import Path
import glob


# Note: the htmldocx library must be installed (e.g. `pip install htmldocx`).

def one_html_to_docx(html_file, to_docx_file, parser=None):
    """Convert a single HTML file to a Word document.

    Conversion errors are reported on stdout instead of being raised, so a
    batch run can carry on with the remaining files.

    Args:
        html_file: Path of the HTML file to convert.
        to_docx_file: Output path handed to the parser. Callers in this file
            pass it without the ".docx" extension.
        parser: Object exposing ``parse_html_file(html_file, to_docx_file)``.
            When omitted, a new ``HtmlToDocx`` instance is created for this
            call.

    Returns:
        None.
    """
    if parser is None:
        # Build the parser lazily: a default written in the signature would be
        # evaluated once at definition time and shared by every call.
        parser = HtmlToDocx()
    try:
        parser.parse_html_file(html_file, to_docx_file)
    except Exception as e:
        # Deliberate best-effort: report the failing file and keep going.
        print(f"转换{html_file}出错，", e)


def all_html_to_docx(html_dir, docs_dir):
    """Convert every ``*.html`` file under ``html_dir`` into ``docs_dir``.

    The directory layout below ``html_dir`` is mirrored below ``docs_dir``.
    A file is skipped when its ``.docx`` counterpart already exists, so an
    interrupted run can be resumed.

    Args:
        html_dir: Root directory that is searched recursively for HTML files.
        docs_dir: Root directory that receives the converted documents.

    Returns:
        None.
    """
    source_root = Path(html_dir)
    target_root = Path(docs_dir)
    # Join the pattern with Path so the separator matches the platform, and
    # escape the root so glob metacharacters in its name are taken literally.
    pattern = str(Path(glob.escape(str(html_dir))) / "**" / "*.html")
    for html_file in glob.glob(pattern, recursive=True):
        # Re-root by path components and drop only the final suffix; plain
        # substring replacement would also rewrite matching text elsewhere in
        # the path (e.g. a directory whose name contains ".html").
        relative = Path(html_file).relative_to(source_root)
        target_stem = target_root / relative.with_suffix("")
        # The converter is expected to append ".docx" to the path it is given,
        # hence the extension is added only for the existence check.
        docx_path = target_stem.with_name(target_stem.name + ".docx")
        if docx_path.exists():
            continue
        # Nested output folders must exist before the document can be saved.
        docx_path.parent.mkdir(parents=True, exist_ok=True)
        one_html_to_docx(html_file, str(target_stem))


if __name__ == "__main__":
    # Batch run: convert the HTML files under html_dir into documents under
    # docs_dir. To convert a single file instead, call
    # one_html_to_docx(html_file, to_docx_file) with the HTML path and the
    # output path without its extension.
    html_dir, docs_dir = r"D:\dl\htmls", r"D:\dl\docs"
    all_html_to_docx(html_dir=html_dir, docs_dir=docs_dir)
